Task #1 - Set working directory & import packages
# Setting directory of script if running in RStudio
library(rstudioapi)
if(rstudioapi::isAvailable()){
path <- rstudioapi::getActiveDocumentContext()$path
Encoding(path) <- "UTF-8"
setwd(dirname(path))
}
# Load libraries
library(tidyverse)
library(gridExtra) # to plot multiple ggplot objects
library(scales)
library(lubridate)
library(plotly)
Task #2 - Merge the 12 months of sales data into a single CSV file
# specifies all csv files within our input folder
file_names <- list.files(
path = "input",
pattern = "*.csv",
full.names = TRUE)
for (i in file_names){
df <- read_csv((i))
assign(substr(i, 7, nchar(i)-4) , df)
rm(df)
}
# listing all the dataframes stored in our global environment
list_dataframes <- Filter(function(x) is(x, "data.frame"), mget(ls()))
# bind into a single dataframe
all_months_data <- do.call(rbind, list_dataframes)
# remove individual monthly sales dataframes (for cleanliness)
rm(list=ls(pattern="Sales_"))
Task #3 - Explore the merged dataset
# view first 8 rows
head(all_months_data, 8)
## # A tibble: 8 × 6
## `Order ID` Product `Quantity Order… `Price Each` `Order Date` `Purchase Addre…
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 176558 USB-C … 2 11.95 04/19/19 08… 917 1st St, Dal…
## 2 <NA> <NA> <NA> <NA> <NA> <NA>
## 3 176559 Bose S… 1 99.99 04/07/19 22… 682 Chestnut St…
## 4 176560 Google… 1 600 04/12/19 14… 669 Spruce St, …
## 5 176560 Wired … 1 11.99 04/12/19 14… 669 Spruce St, …
## 6 176561 Wired … 1 11.99 04/30/19 09… 333 8th St, Los…
## 7 176562 USB-C … 1 11.95 04/29/19 13… 381 Wilson St, …
## 8 176563 Bose S… 1 99.99 04/02/19 07… 668 Center St, …
# view structure of dataframe
str(all_months_data)
## spec_tbl_df [186,850 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Order ID : chr [1:186850] "176558" NA "176559" "176560" ...
## $ Product : chr [1:186850] "USB-C Charging Cable" NA "Bose SoundSport Headphones" "Google Phone" ...
## $ Quantity Ordered: chr [1:186850] "2" NA "1" "1" ...
## $ Price Each : chr [1:186850] "11.95" NA "99.99" "600" ...
## $ Order Date : chr [1:186850] "04/19/19 08:46" NA "04/07/19 22:30" "04/12/19 14:38" ...
## $ Purchase Address: chr [1:186850] "917 1st St, Dallas, TX 75001" NA "682 Chestnut St, Boston, MA 02215" "669 Spruce St, Los Angeles, CA 90001" ...
## - attr(*, "spec")=
## .. cols(
## .. `Order ID` = col_character(),
## .. Product = col_character(),
## .. `Quantity Ordered` = col_character(),
## .. `Price Each` = col_character(),
## .. `Order Date` = col_character(),
## .. `Purchase Address` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# count # of NA values in each column
colSums(is.na(all_months_data))
## Order ID Product Quantity Ordered Price Each
## 545 545 545 545
## Order Date Purchase Address
## 545 545
Task #4 - Data cleaning!
# Rename column headers & convert datatypes
all_months_data <- all_months_data %>%
rename(
Order_ID = `Order ID`,
Order_Date = `Order Date`,
Quantity_Ordered = `Quantity Ordered`,
Price_Each = `Price Each`,
Purchase_Address = `Purchase Address`) %>%
mutate(
Order_Date_Time = as.POSIXct(Order_Date, format="%m/%d/%y %H:%M", tz=Sys.timezone()),
Order_Date = as.Date(Order_Date, format = "%m/%d/%y"),
Quantity_Ordered = as.numeric(Quantity_Ordered),
Price_Each = as.numeric(Price_Each))
# Given that we've spotted a couple missing values in our dataset, we'll need to drop any NA values
all_months_data_droppedNAs <- na.omit(all_months_data)
# QA if we've dropped a maximum of 900 rows based on our data exploration discovery in task #3
nrow(all_months_data)-nrow(all_months_data_droppedNAs)
## [1] 903
Task #5 - Add new columns
all_months_data_droppedNAs <- all_months_data_droppedNAs %>%
mutate(
Month = format(Order_Date, "%m"),
Month = as.numeric(Month), # convert to numeric value
Sales = Price_Each * Quantity_Ordered,
Product_Category = case_when(
str_detect(Product, "Laptop") ~ "Laptop",
str_detect(Product, "Monitor|TV") ~ "Monitor/TV",
str_detect(Product, "Charging|Batteries") ~ "Charging Accessories",
str_detect(Product, "Headphones") ~ "Headphones",
str_detect(Product, "Phone") ~ "Smartphone",
str_detect(Product, "Washing|Dryer") ~ "Cleaning",
TRUE ~ "Other")
)
# Add a 'City' column by parsing the 'Purchase_Address' column
all_months_data_droppedNAs$City <- str_split_fixed(all_months_data_droppedNAs$Purchase_Address,",", 2)[,2]
all_months_data_droppedNAs$City <- str_split_fixed(all_months_data_droppedNAs$City,",", 2)[,1]
head(all_months_data_droppedNAs)
## # A tibble: 6 × 11
## Order_ID Product Quantity_Ordered Price_Each Order_Date Purchase_Address
## <chr> <chr> <dbl> <dbl> <date> <chr>
## 1 176558 USB-C Cha… 2 12.0 2019-04-19 917 1st St, Dallas…
## 2 176559 Bose Soun… 1 100. 2019-04-07 682 Chestnut St, B…
## 3 176560 Google Ph… 1 600 2019-04-12 669 Spruce St, Los…
## 4 176560 Wired Hea… 1 12.0 2019-04-12 669 Spruce St, Los…
## 5 176561 Wired Hea… 1 12.0 2019-04-30 333 8th St, Los An…
## 6 176562 USB-C Cha… 1 12.0 2019-04-29 381 Wilson St, San…
## # … with 5 more variables: Order_Date_Time <dttm>, Month <dbl>, Sales <dbl>,
## # Product_Category <chr>, City <chr>
Question 1: What was the best month for sales?
ggplotly(all_months_data_droppedNAs %>%
group_by(Month) %>%
summarise(Total_Sales = sum(Sales)) %>%
ungroup() %>%
ggplot(aes(x = as.factor(Month), y = Total_Sales)) +
geom_bar(stat = 'identity', fill = "royal blue") +
labs(
title = "December was the best month for sales",
x = "Month",
y = "Sales in USD ($ Millions)",
) +
theme_light() +
geom_text(aes(
label = paste0(dollar_format()(Total_Sales/1000000),"M")),
position = position_stack(vjust = 0.5),
size = 5,
angle = 90,
color = "white"))
Question 2: What city had the highest number of sales?
ggplotly(all_months_data_droppedNAs %>%
group_by(City) %>%
summarise(Total_Sales = sum(Sales)) %>%
ungroup() %>%
ggplot(aes(x = reorder(City, Total_Sales), y = Total_Sales)) +
geom_bar(stat = 'identity', fill = "royal blue") +
labs(
title = "San Francisco earns the highest number of sales",
x = "City",
y = "Sales in USD ($ Millions)",
) +
theme_light() +
coord_flip() +
geom_text(aes(
label = paste0(dollar_format()(Total_Sales/1000000),"M")),
position = position_stack(vjust = 0.5),
size = 5,
color = "white"))
Question 3: What’s the most popular product?
ggplotly(all_months_data_droppedNAs %>%
group_by(Product) %>%
summarise(Total_Quantity = sum(Quantity_Ordered)) %>%
ungroup() %>%
ggplot(aes(x = reorder(Product, Total_Quantity), y = Total_Quantity)) +
geom_bar(stat = 'identity', fill = "royal blue") +
coord_flip() +
theme_light() +
labs(
title = "Most popular product purchases are charging cables, batteries, and headphones",
x = "Product",
y = "Total Quantity Sold"
) +
geom_text(aes(
label = Total_Quantity),
position = position_stack(vjust = 0.5),
size = 5,
color = "white"))
Question 4: What products are most often sold together?
ggplotly(all_months_data_droppedNAs %>%
group_by(Order_ID) %>%
mutate(Order_items = paste0(Product, collapse = ", ")) %>%
summarise(
Order_items,
Total_Products = n_distinct(Product)) %>%
filter(Total_Products > 1) %>%
ungroup() %>%
group_by(Order_items) %>%
summarise(Order_Frequency = n_distinct(Order_ID)) %>%
ungroup() %>%
arrange(desc(Order_Frequency)) %>%
head(20) %>%
ggplot(aes(x = reorder(Order_items, Order_Frequency), y = Order_Frequency)) +
geom_bar(stat = 'identity', fill = "royal blue") +
coord_flip() +
labs(
title = "Smart Phones are typically purchased along with charging cables or audio accessories",
x = "List of Ordered Products",
y = "Order Frequency"
) +
theme_light() +
geom_text(aes(
label = Order_Frequency),
position = position_stack(vjust = 0.5),
size = 4,
color = "white"))
Question 5: Which product generated the highest revenue?
ggplotly(all_months_data_droppedNAs %>%
group_by(Product,Product_Category) %>%
summarise(Total_Sales = sum(Sales)) %>%
ungroup() %>%
ggplot(aes(x = reorder(Product, Total_Sales), y = Total_Sales, fill = Product_Category)) +
geom_bar(stat = 'identity') +
theme_light() +
coord_flip() +
labs(
title = "Laptops & Smartphones generates the most revenue",
x = "Product",
y = "Total Revenue"
) +
geom_text(aes(
label = paste0(dollar_format()(Total_Sales/1000000),"M")),
position = position_stack(vjust = 0.5),
size = 5,
color = "white"))
Question 6: What time should we display advertisements to maximize likelihood of customer’s buying product?
# Add 2 new columns, 'Hour' & 'Minutes' based on the 'Order_Date_Time' column
all_months_data_droppedNAs$Hour <- format(as.POSIXct(all_months_data_droppedNAs$Order_Date_Time), format = "%H")
all_months_data_droppedNAs$Minute <- format(as.POSIXct(all_months_data_droppedNAs$Order_Date_Time), format = "%M")
all_months_data_droppedNAs %>%
group_by(City, Hour) %>%
summarise(Total_Orders = n_distinct(Order_ID)) %>%
ungroup() %>%
ggplot(aes(x = as.numeric(Hour), y = Total_Orders, color = City)) +
geom_line() +
geom_point() +
labs(
title = "Most sales occur between 10AM-1PM or 6PM-8PM",
x = "Hour",
y = "# of Orders"
) +
theme_light()
